library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readxl)
library(readr)
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.4-8, (SVN revision 845)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
##  Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/gdal
##  GDAL binary built with GEOS: FALSE 
##  Loaded PROJ.4 runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
##  Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/proj
##  Linking to sp version: 1.3-2
library(sp)
library(tmap)
library(leaflet)
library(ggplot2)
library(ggthemes)

New York’s Inequality Meets COVID-19

Project Objective

  • Visualize COVID-19 positive cases at zipcode level, and explore if there is any correlation with demographic features such as median income, educational attainment, and race.
  • Select key zipcodes with the highest and lowest positive cases per 1,000 residents and explore if we can find interesting observations in terms of daily growth rate –> can this mean that certion regions are not practicing social distancing enough?

Data Visualizations

  1. 1 chloropleth map to show positive cases per 1,000 residents at zipcode level —> hover label to show education attainment, income (i.e., % of population below the poverty line), and race (% of Black residents)
  2. scatterplot to show corrleation betwen COVID-19 & income level (median income at zip code level), group it by borough
  3. line chart to see growth trajectory of zey areas

Loading Data

#corona data 
summary <- read.csv("summary.csv")
boro <- read.csv("boro.csv")
daily <- read.csv("daily.csv")
case <- read.csv("case-hosp-death.csv")
test_zip <- read.csv("tests_zip.csv") 

#Combined data set of COVID-19 cases and demographics data at zipcode level
#demographics data at zipcode level was attained through the uszipcodes API and datasets 
#were combined in Python 
zip_demo <- read.csv("zip_with_demo_v2.csv")
zip_demo <- zip_demo %>% rename("ZIPCODE" = "MODZCTA")
zip_demo2 <- zip_demo %>% mutate(CasesPer1000 = (Positive * 1000) / Total)
zip_median <- read.csv("final.csv")
zip_median <- zip_median %>% select("MODZCTA", "median_household_income")
zip_median <- zip_median %>% rename("ZIPCODE" = "MODZCTA")
zip_demo2 <- left_join(zip_demo2, zip_median, by = "ZIPCODE")

Choloropleth map to show positive caes by zipcode

nyc <- readOGR(dsn = "/Users/sunghwapark/Desktop/DataVisFinalProj/ZIP_CODE_040114", layer = "ZIP_CODE_040114")
## OGR data source with driver: ESRI Shapefile 
## Source: "/Users/sunghwapark/Desktop/DataVisFinalProj/ZIP_CODE_040114", layer: "ZIP_CODE_040114"
## with 263 features
## It has 12 fields
#class(nyc)
#str(nyc, max.level=2)
nyc2 <- nyc

Static Map for Positive Cases

zip_demo2$ZIPCODE <- as.character(zip_demo2$ZIPCODE)
nyc2@data <- nyc2@data %>% 
  left_join(zip_demo2, by = "ZIPCODE")
## Warning: Column `ZIPCODE` joining factor and character vector, coercing
## into character vector
#popup = c("Zipcode"="ZIPCODE", "COVID-19 Case"= "Positive")
tm <- tm_shape(nyc2) + tm_fill("Positive", palette = "BuPu") + tm_borders(alpha=.5, col = "white")
tm

Static Map for Positive Cases per 1,000 Residents

tm2 <- tm_shape(nyc2) + tm_fill("CasesPer1000", palette = "Blues") + tm_borders(alpha=.5, col = "white")
tm2

Interactive Map of Positive Cases

tm_leaflet <- tmap_leaflet(tm)
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
m <- tm_leaflet %>% addTiles('http://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}.png')

content <- paste("Zipcode:",nyc2@data$ZIPCODE, "<br/>",
                 "COVID-19 CASES:",nyc2@data$Positive,"<br/>",
                 "Demographics:","<br/>", 
                 round(nyc2@data$population_by_race.Black.Or.African.American*100, 1), "of residents are Black", "<br/>", 
                 round(nyc2@data$household_income....25.000*100, 1), "of people live below the poverty line", "<br/>", 
                 round(nyc2@data$educational_attainment_for_population_25_and_over.Less.Than.High.School.Diploma*100, 1), "of people have less than high school diploma")

m
tm_leaflet3 <- tmap_leaflet(tm)
tm_leaflet3$x$calls[[5]]$args[[7]] <- leaflet:::evalFormula(
  ~content,
  data=nyc2
)
tm_leaflet3$x$calls[[5]]$args[[9]] = highlightOptions = highlightOptions( 
          color='#000000', weight = 3, 
          bringToFront = TRUE, sendToBack = TRUE)
tm_leaflet3 %>% addTiles('http://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}.png')

Scatterplot to visualize correlation between COVID19 and income

nycdata <- nyc2@data
nycdata2 <- nyc2@data
nycdata2 <- nycdata2 %>% mutate(population_by_race_nonWhite = (1 - population_by_race.White))
nycdata2$positive_per_100k <- as.numeric(as.character(nycdata2$positive_per_100k)) 
nycdata2$median_household_income <- as.numeric(as.character(nycdata2$median_household_income)) 
gg <- ggplot(nycdata2, aes(x = median_household_income, y = positive_per_100k, color = 
                             -educational_attainment_for_population_25_and_over.High.School.Graduate)) + xlim(20000, 180000) + geom_point(size = 2, alpha = 1) + geom_smooth(aes(group = 1),
                 method = "lm",
                 se = FALSE, 
                 color = "lightpink3") + theme_economist()
gg
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 76 rows containing non-finite values (stat_smooth).
## Warning: Removed 76 rows containing missing values (geom_point).